df_us_covid <- read_csv('timeseries_usa_county_march1_april_09.csv')
df_us_covid <- df_us_covid %>%
filter(time <=31) %>%
arrange(countyfips) %>%
mutate(stabil = -stabil) %>%
dplyr::rename(county_fips = countyfips,
pers_o = open,
pers_c = sci,
pers_e = extra,
pers_a = agree,
pers_n = stabil)
df_us_covid <- df_us_covid %>%
dplyr::select(county_fips, time, mark, rate_day, pers_o,
pers_c, pers_e, pers_a, pers_n)
df_us_covid %>% head()
df_us_ctrl <- read.csv('controls_US.csv')
df_us_ctrl <- df_us_ctrl %>% select(-county_name) %>%
rename(county_fips = county)
df_us_ctrl %>% head()
NA
socdist <- df_us_socdist %>% merge(df_us_socdist_fb, by = c("county_fips", "time"))
socdist[c('daily_distance_diff', 'daily_visitation_diff', 'socdist_tiles', 'socdist_single_tile')] %>%
cor(use = 'pairwise.complete')
daily_distance_diff daily_visitation_diff socdist_tiles socdist_single_tile
daily_distance_diff 1.0000000 0.1361318 0.3061683 -0.2746350
daily_visitation_diff 0.1361318 1.0000000 0.3826102 -0.3624062
socdist_tiles 0.3061683 0.3826102 1.0000000 -0.7544123
socdist_single_tile -0.2746350 -0.3624062 -0.7544123 1.0000000
df_us <- plyr::join_all(list(df_us_covid, df_us_socdist_fb),
by = c('county_fips', 'time'),
type = 'inner') %>%
plyr::join(df_us_ctrl, by='county_fips') %>%
arrange(county_fips, time)
df_us %>% head()
NA
# distribution of observations per county
df_us %>% group_by(county_fips) %>%
summarise(mark = mean(mark)) %>%
ggplot(aes(x=mark)) +
geom_histogram(color="black", fill="white", binwidth = 300) +
ggtitle('Distribution of observations per county')
# distributions of mean prevalence rates per county
df_us %>% group_by(county_fips) %>%
summarise(rate_day = mean(rate_day)) %>%
ggplot(aes(x=rate_day)) +
geom_histogram(color="black", fill="white", binwidth = 0.01) +
ggtitle('Distribution of mean prevalence rates by county')
# distribution of mean sd distance measue
df_us %>% group_by(county_fips) %>%
summarise(socdist_tiles = mean(socdist_tiles)) %>%
ggplot(aes(x=socdist_tiles)) +
geom_histogram(color="black", fill="white", bins = 200) +
ggtitle('Distribution of mean tiles visited measure by county')
# distribution of mean sd visit measue
df_us %>% group_by(county_fips) %>%
summarise(socdist_single_tile = mean(socdist_single_tile)) %>%
ggplot(aes(x=socdist_single_tile)) +
geom_histogram(color="black", fill="white", bins = 200) +
ggtitle('Distribution of mean single tile measute by county')
NA
NA
df_us %>% sample_n(20000) %>%
ggplot(aes(x=time, y=rate_day)) +
geom_point(aes(col=county_name, size=mark)) +
geom_smooth(method="loess", se=T) +
theme(legend.position="none") +
ggtitle("Overall prevalence over time")
pers <- c('pers_o', 'pers_c', 'pers_e', 'pers_a', 'pers_n')
for (i in pers){
gg <- df_us %>% mutate(prev_tail = cut(.[[i]],
breaks = c(-Inf, quantile(.[[i]], 0.05), quantile(.[[i]], 0.95), Inf),
labels = c('lower tail', 'center', 'upper tail'))) %>%
filter(prev_tail != 'center') %>%
ggplot(aes(x=time, y=rate_day)) +
geom_point(aes(col=county_name, size=mark)) +
geom_smooth(method="loess", se=T) +
facet_wrap(~prev_tail) +
theme(legend.position="none") +
ggtitle(i)
print(gg)
}
df_us %>% select(-time, -date, -county_name) %>%
group_by(county_fips) %>%
summarize_if(is.numeric, mean) %>%
select(-county_fips) %>%
cor(use='pairwise.complete.obs') %>%
round(3)
mark rate_day pers_o pers_c pers_e pers_a pers_n socdist_tiles socdist_single_tile
mark 1.000 0.189 0.279 -0.052 0.097 -0.041 -0.174 -0.377 0.214
rate_day 0.189 1.000 0.196 -0.049 0.044 -0.037 -0.094 -0.240 0.167
pers_o 0.279 0.196 1.000 -0.052 -0.086 -0.154 -0.228 -0.249 0.208
pers_c -0.052 -0.049 -0.052 1.000 0.148 0.650 -0.402 0.165 -0.258
pers_e 0.097 0.044 -0.086 0.148 1.000 0.235 -0.386 -0.065 -0.061
pers_a -0.041 -0.037 -0.154 0.650 0.235 1.000 -0.384 0.123 -0.277
pers_n -0.174 -0.094 -0.228 -0.402 -0.386 -0.384 1.000 0.062 0.190
socdist_tiles -0.377 -0.240 -0.249 0.165 -0.065 0.123 0.062 1.000 -0.595
socdist_single_tile 0.214 0.167 0.208 -0.258 -0.061 -0.277 0.190 -0.595 1.000
airport_distance -0.212 -0.055 -0.093 -0.094 -0.109 -0.103 0.040 0.258 -0.135
republican -0.345 -0.234 -0.349 -0.046 -0.077 -0.086 0.307 0.350 -0.231
medage -0.223 -0.075 -0.034 -0.066 -0.092 -0.074 0.232 0.013 0.301
male -0.115 -0.052 -0.117 -0.096 -0.058 -0.154 0.054 0.126 -0.041
popdens 0.322 0.375 0.222 -0.044 0.028 -0.070 -0.044 -0.244 0.221
manufact -0.162 -0.138 -0.386 0.070 0.034 0.119 0.179 0.057 -0.126
tourism 0.112 0.111 0.368 0.017 -0.003 -0.067 -0.188 -0.006 0.074
academics 0.418 0.284 0.462 -0.116 0.141 -0.145 -0.341 -0.434 0.259
medinc 0.307 0.228 0.226 -0.173 0.134 -0.219 -0.215 -0.444 0.225
physician_pc -0.181 -0.100 -0.213 0.107 -0.047 0.112 0.117 0.163 -0.114
airport_distance republican medage male popdens manufact tourism academics medinc
mark -0.212 -0.345 -0.223 -0.115 0.322 -0.162 0.112 0.418 0.307
rate_day -0.055 -0.234 -0.075 -0.052 0.375 -0.138 0.111 0.284 0.228
pers_o -0.093 -0.349 -0.034 -0.117 0.222 -0.386 0.368 0.462 0.226
pers_c -0.094 -0.046 -0.066 -0.096 -0.044 0.070 0.017 -0.116 -0.173
pers_e -0.109 -0.077 -0.092 -0.058 0.028 0.034 -0.003 0.141 0.134
pers_a -0.103 -0.086 -0.074 -0.154 -0.070 0.119 -0.067 -0.145 -0.219
pers_n 0.040 0.307 0.232 0.054 -0.044 0.179 -0.188 -0.341 -0.215
socdist_tiles 0.258 0.350 0.013 0.126 -0.244 0.057 -0.006 -0.434 -0.444
socdist_single_tile -0.135 -0.231 0.301 -0.041 0.221 -0.126 0.074 0.259 0.225
airport_distance 1.000 0.121 0.029 0.194 -0.144 -0.138 0.101 -0.133 -0.177
republican 0.121 1.000 0.134 0.162 -0.264 0.172 -0.220 -0.452 -0.192
medage 0.029 0.134 1.000 -0.040 -0.105 0.091 -0.080 -0.210 -0.107
male 0.194 0.162 -0.040 1.000 -0.101 -0.080 -0.046 -0.175 -0.004
popdens -0.144 -0.264 -0.105 -0.101 1.000 -0.120 0.049 0.242 0.154
manufact -0.138 0.172 0.091 -0.080 -0.120 1.000 -0.380 -0.385 -0.179
tourism 0.101 -0.220 -0.080 -0.046 0.049 -0.380 1.000 0.279 -0.022
academics -0.133 -0.452 -0.210 -0.175 0.242 -0.385 0.279 1.000 0.719
medinc -0.177 -0.192 -0.107 -0.004 0.154 -0.179 -0.022 0.719 1.000
physician_pc -0.038 0.196 0.078 0.159 -0.079 0.157 -0.226 -0.369 -0.202
physician_pc
mark -0.181
rate_day -0.100
pers_o -0.213
pers_c 0.107
pers_e -0.047
pers_a 0.112
pers_n 0.117
socdist_tiles 0.163
socdist_single_tile -0.114
airport_distance -0.038
republican 0.196
medage 0.078
male 0.159
popdens -0.079
manufact 0.157
tourism -0.226
academics -0.369
medinc -0.202
physician_pc 1.000
# function calculates all relevant models
run_models <- function(y, lvl1_x, lvl2_x, lvl2_id, data, ctrls=F){
# subset data
data = data %>%
dplyr::select(all_of(y), all_of(lvl1_x), all_of(lvl2_x), all_of(lvl2_id),
popdens, rate_day, all_of(y))
data = data %>%
dplyr::rename(y = all_of(y),
lvl1_x = all_of(lvl1_x),
lvl2_x = all_of(lvl2_x),
lvl2_id = all_of(lvl2_id)
)
# configure optimization procedure
ctrl_config <- lmeControl(opt = 'optim', maxIter = 100, msMaxIter = 100)
# baseline
baseline <- lme(fixed = y ~ 1, random = ~ 1 | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# random intercept fixed slope
random_intercept <- lme(fixed = y ~ lvl1_x + lvl2_x,
random = ~ 1 | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# random intercept random slope
random_slope <- lme(fixed = y ~ lvl1_x + lvl2_x,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction <- lme(fixed = y ~ lvl1_x * lvl2_x,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# create list with results
results <- list('baseline' = baseline,
"random_intercept" = random_intercept,
"random_slope" = random_slope,
"interaction" = interaction)
if (ctrls == 'dem' | ctrls == 'prev'){
# random intercept random slope
random_slope_ctrl_dem <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction_ctrl_main_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction_ctrl_int_dem <- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# create list with results
results <- list('baseline' = baseline,
"random_intercept" = random_intercept,
"random_slope" = random_slope,
"interaction" = interaction,
"random_slope_ctrl_dem" = random_slope_ctrl_dem,
"interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
"interaction_ctrl_int_dem" = interaction_ctrl_int_dem)
}
if (ctrls == 'prev'){
# random intercept random slope
random_slope_ctrl_prev <- lme(fixed = y ~ lvl1_x + lvl2_x + popdens + rate_day,
random = ~ lvl1_x + rate_day | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction_ctrl_main_prev <- lme(fixed = y ~ lvl1_x * lvl2_x + popdens + rate_day,
random = ~ lvl1_x | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction_ctrl_int_prev<- lme(fixed = y ~ lvl1_x * lvl2_x + lvl1_x * popdens + rate_day,
random = ~ lvl1_x + rate_day | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# create list with results
results <- list('baseline' = baseline,
"random_intercept" = random_intercept,
"random_slope" = random_slope,
"interaction" = interaction,
"random_slope_ctrl_dem" = random_slope_ctrl_dem,
"interaction_ctrl_main_dem" = interaction_ctrl_main_dem,
"interaction_ctrl_int_dem" = interaction_ctrl_int_dem,
"random_slope_ctrl_prev" = random_slope_ctrl_prev,
"interaction_ctrl_main_prev" = interaction_ctrl_main_prev,
"interaction_ctrl_int_prev" = interaction_ctrl_int_prev)
}
if(ctrls == 'exp'){
# random intercept random slope
random_slope_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) + lvl2_x,
random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# cross level interaction
interaction_exp <- lme(fixed = y ~ (lvl1_x + I(lvl1_x^2)) * lvl2_x,
random = ~ (lvl1_x + I(lvl1_x^2)) | lvl2_id,
data = data,
correlation = corAR1(),
control = ctrl_config,
method = 'ML')
# create list with results
results <- list('baseline' = baseline,
"random_intercept" = random_intercept,
"random_slope" = random_slope,
"interaction" = interaction,
"random_slope_exp" = random_slope_exp,
"interaction_exp" = interaction_exp)
}
return(results)
}
# extracts table with coefficients and tests statistics
extract_results <- function(models) {
models_summary <- models %>%
map(summary) %>%
map("tTable") %>%
map(as.data.frame) %>%
map(round, 10)
# %>% map(~ .[str_detect(rownames(.), 'Inter|lvl'),])
return(models_summary)
}
compare_models <- function(models) {
mdl_names <- models %>% names()
str = ''
for (i in mdl_names){
mdl_str <- paste('models$', i, sep = '')
if(i == 'baseline'){
str <- mdl_str
}else{
str <- paste(str, mdl_str, sep=', ')
}
}
anova_str <- paste0('anova(', str, ')')
mdl_comp <- eval(parse(text=anova_str))
rownames(mdl_comp) = mdl_names
return(mdl_comp)
}
lvl2_scaled <- df_us %>%
select(-time, -mark, -date, -county_name, -rate_day,
-socdist_tiles, -socdist_single_tile) %>%
distinct() %>%
mutate_at(vars(-county_fips), scale)
lvl1_scaled <- df_us %>% select(county_fips, time, rate_day, socdist_single_tile) %>%
mutate_at(vars(-county_fips, -time), scale)
df_us_scaled <- plyr::join(lvl1_scaled, lvl2_scaled, by = 'county_fips')
head(df_us_scaled)
models_o_covid <-run_models(y = 'rate_day',
lvl1_x = 'time',
lvl2_x = 'pers_o',
lvl2_id = 'county_fips',
data = df_us_scaled,
ctrls = 'dem')
extract_results(models_o_covid)
$baseline
$random_intercept
$random_slope
$interaction
$random_slope_ctrl_dem
$interaction_ctrl_main_dem
$interaction_ctrl_int_dem
compare_models(models_o_covid)
NA
models_c_covid <-run_models(y = 'rate_day',
lvl1_x = 'time',
lvl2_x = 'pers_c',
lvl2_id = 'county_fips',
data = df_us_scaled,
ctrls = 'dem')
extract_results(models_c_covid)
$baseline
$random_intercept
$random_slope
$interaction
$random_slope_ctrl_dem
$interaction_ctrl_main_dem
$interaction_ctrl_int_dem
compare_models(models_c_covid)
NA
NA
models_e_covid <-run_models(y = 'rate_day',
lvl1_x = 'time',
lvl2_x = 'pers_e',
lvl2_id = 'county_fips',
data = df_us_scaled,
ctrls = 'dem')
extract_results(models_e_covid)
$baseline
$random_intercept
$random_slope
$interaction
$random_slope_ctrl_dem
$interaction_ctrl_main_dem
$interaction_ctrl_int_dem
compare_models(models_e_covid)
NA
NA
models_a_covid <-run_models(y = 'rate_day',
lvl1_x = 'time',
lvl2_x = 'pers_a',
lvl2_id = 'county_fips',
data = df_us_scaled,
ctrls = 'dem')
extract_results(models_a_covid)
$baseline
$random_intercept
$random_slope
$interaction
$random_slope_ctrl_dem
$interaction_ctrl_main_dem
$interaction_ctrl_int_dem
compare_models(models_a_covid)
NA
NA
models_n_covid <-run_models(y = 'rate_day',
lvl1_x = 'time',
lvl2_x = 'pers_n',
lvl2_id = 'county_fips',
data = df_us_scaled,
ctrls = 'dem')
extract_results(models_n_covid)
$baseline
$random_intercept
$random_slope
$interaction
$random_slope_ctrl_dem
$interaction_ctrl_main_dem
$interaction_ctrl_int_dem
compare_models(models_n_covid)
NA
NA
# slope prevalence
df_us_slope_prev <- df_us %>% split(.$county) %>%
map(~ lm(rate_day ~ time, data = .)) %>%
map(coef) %>%
map_dbl('time') %>%
as.data.frame() %>%
rownames_to_column('county_fips') %>%
rename(slope_prev = '.')
df_us_slope_prev <- df_us %>% select(county_fips:pers_n) %>%
distinct() %>%
mutate(county_fips = as.character(county_fips)) %>%
inner_join(df_us_slope_prev, by = 'county_fips') %>%
select(-viocrime, -assn2014, -sk2014, -trade_element_share,
-manag_share, -patents, -married_share, -purewhite_share,
-lifeexp_2010_14, -male_share, -creative_share, -rep2008,
-dem2008, -other2008, -totalpop, -population)
# slope social distancing
df_us_slope_socdist <- df_us %>% split(.$county) %>%
map(~ lm(socdist_tiles ~ time, data = .)) %>%
map(coef) %>%
map_dbl('time') %>%
as.data.frame() %>%
rownames_to_column('county_fips') %>%
rename(slope_socdist = '.')
df_us_slope_socdist <- df_us %>% select(county_fips:pers_n) %>%
distinct() %>%
mutate(county_fips = as.character(county_fips)) %>%
inner_join(df_us_slope_socdist, by = 'county_fips') %>%
select(-viocrime, -assn2014, -sk2014, -trade_element_share,
-manag_share, -patents, -married_share, -purewhite_share,
-lifeexp_2010_14, -male_share, -creative_share, -rep2008,
-dem2008, -other2008, -totalpop, -population)
df_us_slopes_prev %>% ggplot(aes(slope_prev)) + geom_histogram(bins = 100)
df_us_slopes_socdist %>% ggplot(aes(slope_socdist)) + geom_histogram(bins = 100)
ctrls <- cforest_unbiased(ntree=500, mtry=5)
crf_fit_prev <- cforest(slope_prev ~ .,
df_us_slope_prev[-1],
controls = ctrls)
crf_varimp_prev <- varimp(crf_fit_prev, nperm = 5)
crf_varimp_cond_prev <- varimp(crf_fit_prev, conditional = T)
crf_varimp_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
ggplot(aes(x=variable, y=.)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 90))
crf_varimp_cond_prev %>% as.data.frame() %>% rownames_to_column('variable') %>%
ggplot(aes(x=variable, y=.)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 90))
Social distancing data unacast